Employee attrition is an issue that has puzzled the Human Resources managers of many companies for a long time. In this project, we analyse which factors drive employee attrition (and, conversely, retention) and which of them have the greatest influence. We use a dataset published by the Human Resources department of IBM.
Poses a problem to be solved:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import scipy
from scipy import stats
from sklearn.preprocessing import LabelEncoder
import random
# Load the HR attrition dataset from the working directory.
Data = pd.read_csv("Data.csv")
# DataFrame.shape is (n_rows, n_columns): index 0 counts the records (rows),
# index 1 counts the features (columns).
print("Row :", Data.shape[0])
print("Feature :", Data.shape[1])
Data.columns  # names of all features
Data.head(5)
COMMENT:
=> EmployeeCount and Over18 do not affect the target or the other features, so they are dropped.
# Build a reduced frame without the two columns judged irrelevant to the target.
Data_New = Data.drop(["Over18", "EmployeeCount"], axis="columns")
# Show the dtype of every column of the full frame.
Data.dtypes
COMMENT:
from IPython.display import Image

# Display the image file missingvalue.png, then report for each column
# whether it contains at least one missing value.
Image(filename='missingvalue.png')
Data.isna().any()
=> No data is missing
# Summary statistics of Data_New, with the 1st, 25th, 75th and 99th percentiles requested
# so that both the bulk of each distribution and its extreme tails are visible.
Data_New.describe(percentiles=[0.01, 0.25,0.75, 0.99])
COMMENT:
# Attrition overview: class counts on the left, class shares on the right.
fig = make_subplots(rows=1, cols=2,
                    specs=[[{"type": "bar"}, {"type": "domain"}]])

trace0 = go.Histogram(x=Data['Attrition'], name='In number',
                      marker={'color': ['red', 'blue']}, showlegend=False)

# The second cell is declared as a "domain" subplot but was never filled, which
# left half of the figure empty. Fill it with the share of each Attrition class;
# labels come from the counted index so they can never be mismatched.
attrition_counts = Data['Attrition'].value_counts()
trace1 = go.Pie(labels=list(attrition_counts.index),
                values=list(attrition_counts.values),
                name='In percentage', showlegend=False)

# add_trace(row=..., col=...) is the supported replacement for the deprecated append_trace.
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)

fig.update_layout(title_text='<b> Attrition </b>', showlegend=False)
fig.show()
COMMENT: The number of employees who stayed in their job is much higher than the number of employees who quit.
# Distribution of the Age column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot —
# confirm the installed seaborn version before relying on it.
sns.distplot(Data['Age'])
Age has approximately the shape of a normal distribution.
# Similarly, plot the distribution of the other numerical features on a 5x2 grid.
numeric_features = [
    'TotalWorkingYears', 'MonthlyIncome',
    'YearsAtCompany', 'DistanceFromHome',
    'YearsInCurrentRole', 'YearsWithCurrManager',
    'YearsSinceLastPromotion', 'PercentSalaryHike',
    # This slot used to repeat 'YearsSinceLastPromotion', so one panel was a
    # duplicate. It now shows JobLevel, another numeric column of the dataset.
    'JobLevel', 'TrainingTimesLastYear',
]

fig, ax = plt.subplots(5, 2, figsize=(9, 9))
# ax.flat walks the grid row by row, which matches the order of the list above.
for axis, feature_name in zip(ax.flat, numeric_features):
    sns.distplot(Data[feature_name], ax=axis)
plt.tight_layout()
plt.show()
Convert Attrition to a binary (0/1) variable so that its correlation with the other features can be computed.
# Encode Attrition as 0/1 so that it can take part in the correlation matrix.
Data_binary = pd.read_csv("Data.csv")

# "Yes" (the employee left) becomes 1 and every other value becomes 0, so a
# positive correlation with Attrition means "associated with leaving".
# A single vectorised assignment replaces the row-by-row
# `Data_binary["Attrition"].loc[r] = 0` chained assignment, which pandas does not
# guarantee to write back to the frame.
Data_binary["Attrition"] = (Data_binary["Attrition"] == "Yes").astype(int)

# Correlation is only defined for numeric columns; selecting them explicitly
# works on every pandas version (recent versions raise on text columns).
Corr_data = Data_binary.select_dtypes(include="number").corr()

f, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(Corr_data, vmax=1, annot=True)
The larger the absolute value of a feature's correlation with Attrition, the more likely that feature is a key factor of attrition.
BarPlot_columns = ["TotalWorkingYears", "JobLevel", "YearsInCurrentRole", "YearsWithCurrManager"]


def Bar_plots(var):
    """Draw, for each value of ``Data[var]``, the share of every Attrition class.

    Counts are cross-tabulated against ``Data.Attrition`` and each row is divided
    by its own total, so the bars of one ``var`` value sum to 1.

    Args:
        var: Name of the column of the module-level ``Data`` frame to plot.
    """
    counts = pd.crosstab(Data[var], Data.Attrition)
    row_totals = counts.sum(axis=1).astype(float)
    shares = counts.div(row_totals, axis=0)
    shares.plot(kind="bar", stacked=False, figsize=(8, 4))
    plt.xticks(rotation=90)


# One grouped bar chart per selected column.
for col in BarPlot_columns:
    Bar_plots(col)
# Age -- Attrition: one box of the Age values per Attrition class.
sns.catplot(data=Data, x='Attrition', y='Age', kind='box')
The average age of the employees who left ("Yes") is lower than that of the employees who stayed ("No").
# Graph the relationship between Department and attrition: a mirrored bar chart
# on the left and one No/Yes pie per department on the right.
fig = make_subplots(rows=3, cols=2,
                    specs=[[{'rowspan': 3}, {"type": "domain"}],  # "domain" cells hold the pies
                           [None, {"type": "domain"}],
                           [None, {"type": "domain"}]])

labels = ['R&D', 'Sales', 'HR']
# Department values listed in the same order as the display labels above.
department_order = ['Research & Development', 'Sales', 'Human Resources']
outcome_order = ['No', 'Yes']

# Count once and pin both axes to an explicit order. The previous code paired
# hard-coded labels with value_counts() output, whose order depends on the
# counts themselves, so bars and pie slices could silently be mislabelled.
dept_attrition = (
    pd.crosstab(Data['Department'], Data['Attrition'])
    .reindex(index=department_order, columns=outcome_order, fill_value=0)
)

yes = dept_attrition['Yes']
# Negated so the "Yes" bars are drawn below the axis, mirroring the "No" bars.
trace_yes = go.Bar(x=labels, y=-yes, marker={'color': 'red'}, showlegend=False)
no = dept_attrition['No']
trace_no = go.Bar(x=labels, y=no, marker={'color': 'blue'}, showlegend=False)

## Pie 1 -- upper right
RD = dept_attrition.loc['Research & Development']
trace_3 = go.Pie(labels=outcome_order, values=RD, name='RD')
## Pie 2
Sales = dept_attrition.loc['Sales']
trace_4 = go.Pie(labels=outcome_order, values=Sales, name='Sales')
## Pie 3
HR = dept_attrition.loc['Human Resources']
trace_5 = go.Pie(labels=outcome_order, values=HR, name='HR')

# Add traces; add_trace(row=..., col=...) replaces the deprecated append_trace.
fig.add_trace(trace_yes, row=1, col=1)
fig.add_trace(trace_no, row=1, col=1)
fig.add_trace(trace_3, row=1, col=2)
fig.add_trace(trace_4, row=2, col=2)
fig.add_trace(trace_5, row=3, col=2)

# Customize
fig.update_layout(title_text='<b> Attrition by Department </b>')

# Done
fig.show()
The attrition percentage differs from one department to another => Department is a key factor.
# Bucket MonthlyIncome into four bands with edges at 1000, 5000, 10000, 15000 and 20000.
Data['Income'] = pd.cut(Data['MonthlyIncome'], [1000, 5000, 10000, 15000, 20000])
f, ax = plt.subplots(figsize=(15, 4))
# Draw on the axes created above explicitly instead of relying on the implicit "current axes".
sns.countplot(y='Income', hue='Attrition', data=Data, ax=ax).set_title('Employee Salary Attrition Distribution')
# plt.plot() with no arguments draws nothing; plt.show() is what renders the figure.
plt.show()
def number_or_category(Data_test):
    """Classify a single value as ``"number"`` or ``"category"``.

    A value is a number when ``Data_test + 1`` evaluates without error (ints,
    floats, bools, ...); anything for which the addition fails, such as a
    string or ``None``, is a category.

    Args:
        Data_test: One scalar value, e.g. the first cell of a column.

    Returns:
        str: ``"number"`` or ``"category"``.
    """
    try:
        Data_test + 1
    except Exception:
        # Ordinary errors mean "not addable". A bare ``except`` would also have
        # swallowed KeyboardInterrupt / SystemExit, which must propagate.
        return "category"
    return "number"
# Used when the target is numeric and the feature is categorical.
def Xuly_T_test(Data,feature,target,STT,HT):
    """Test whether the values of ``target`` differ between the groups of ``feature``.

    Rows are grouped by ``Data[feature]``. Two groups are compared with an
    independent two-sample t-test; three or more groups are compared with a
    single one-way ANOVA over all groups at once. (The earlier version tested
    only neighbouring pairs, in the arbitrary iteration order of a ``set``, so
    some pairs were never compared and the reported value could change from
    run to run.)

    The outcome is recorded in the module-level lists: on ``p < 0.05`` the
    factor name goes to ``Key_factor`` and ``str((1 - p) * 100)`` to
    ``P_value``; otherwise the factor name goes to ``Not_key``.

    Args:
        Data: DataFrame holding both columns.
        feature: Name of the grouping (categorical) column.
        target: Name of the numeric column whose values are compared.
        STT: Running column counter supplied by the caller; not used here.
        HT: ``1`` when the caller swapped the roles, i.e. the grouping column
            is really the analysis target and ``target`` is the candidate
            factor; any other value means ``feature`` is the candidate.

    Returns:
        None. Results are appended to ``Key_factor``, ``P_value``, ``Not_key``.
    """
    # The name to report depends only on HT, so resolve it once. The
    # single-group case previously reported ``feature`` even when HT == 1.
    factor_name = target if HT == 1 else feature

    # One list of target values per group. groupby skips missing group keys, and
    # sort=False keeps the groups in order of first appearance (deterministic).
    sample = [list(values) for _, values in Data.groupby(feature, sort=False)[target]]

    # Fewer than two groups: nothing to compare, so it cannot be a key factor.
    if len(sample) < 2:
        Not_key.append(factor_name)
        return

    if len(sample) == 2:
        _, p = stats.ttest_ind(sample[0], sample[1])
    else:
        _, p = stats.f_oneway(*sample)

    # A NaN p-value (e.g. degenerate groups) compares False and lands in Not_key.
    if p < 0.05:
        P_value.append(str((1 - p) * 100))
        Key_factor.append(factor_name)
    else:
        Not_key.append(factor_name)
# Use correlation when both the target and the feature are numeric.
def Xuly_Correlation(Data,feature,target,STT):
    """Classify ``feature`` by its Pearson correlation with ``target``.

    Up to 50 rows are drawn at random and the correlation is computed on the
    feature and target values of those same rows. (The earlier version sampled
    the two columns independently, so each pair mixed values from different
    rows, and it raised on frames with fewer than 50 rows.)

    ``feature`` is appended to the module-level ``Key_factor`` list when
    ``abs(r) >= 0.7`` and to ``Not_key`` otherwise.

    Args:
        Data: DataFrame holding both columns.
        feature: Name of the numeric candidate column.
        target: Name of the numeric target column.
        STT: Running column counter supplied by the caller; not used here.

    Returns:
        None. Results are appended to ``Key_factor`` or ``Not_key``.
    """
    n_rows = len(Data)
    sample_size = min(50, n_rows)

    # Pearson's r needs at least two observations.
    if sample_size < 2:
        Not_key.append(feature)
        return

    # One draw of row positions shared by both columns keeps the pairs aligned.
    positions = random.sample(range(n_rows), k=sample_size)
    sample_feature = Data[feature].iloc[positions].tolist()
    sample_target = Data[target].iloc[positions].tolist()

    r, _ = stats.pearsonr(sample_feature, sample_target)
    if abs(r) >= 0.7:
        Key_factor.append(feature)
    else:
        Not_key.append(feature)
# Input: the dataset as a CSV file. Reading it again rebinds Data to a fresh
# frame, so columns added to the previous frame (such as 'Income') are not carried over.
Data=pd.read_csv("Data.csv")
#print(Data.shape)
#print(Data.head(1))
def _binarize_target(Data, target):
    """Return a copy of ``Data`` whose ``target`` column is 0 for one reference class, 1 otherwise."""
    # Pick the reference class deterministically (smallest by string value)
    # instead of taking whatever element a set happens to yield first.
    classes = sorted(Data[target].dropna().unique(), key=str)
    reference = classes[0] if classes else None
    Data_binary = Data.copy()
    Data_binary[target] = (Data[target] != reference).astype(int)
    return Data_binary


def main_(Data,target):
    """Screen every column of ``Data`` as a possible key factor of ``target``.

    The kind ("number" or "category") of the target and of each feature is
    taken from the first row via ``number_or_category``, and decides the test:

    * number target, number feature: skipped (correlation handling is disabled).
    * number target, category feature: ``Xuly_T_test(Data, feature, target, STT, 0)``.
    * category target, number feature: roles are swapped, the feature values
      are compared across the target classes with ``HT=1``.
    * category target, category feature: the target is encoded as 0/1 on a
      copy and tested against the feature groups.

    The columns "Attrition" and "JobSatisfaction" are never used as features,
    and neither is ``target`` itself. Results are collected by ``Xuly_T_test``
    in the module-level lists ``Key_factor``, ``P_value`` and ``Not_key``.

    Unlike the earlier version, the caller's frame is left untouched: the 0/1
    encoding is written to a copy instead of overwriting ``Data[target]``
    through an alias, so numeric features are always compared across the
    original target classes.

    Args:
        Data: DataFrame with one column per feature plus the target column.
        target: Name of the target column.

    Returns:
        None.
    """
    excluded = {"Attrition", "JobSatisfaction", target}
    target_type = number_or_category(Data[target].iloc[0])

    # Built lazily and at most once: only the category/category case needs it.
    Data_binary = None

    for STT, feature in enumerate(Data.columns):
        if feature in excluded:
            continue

        # Positional lookup on both axes; indexing a row Series with an int
        # position is deprecated in recent pandas.
        feature_type = number_or_category(Data.iloc[0, STT])

        if target_type == "number":
            if feature_type == "category":
                Xuly_T_test(Data, feature, target, STT, 0)
            # number/number: intentionally nothing (Xuly_Correlation is disabled).
        elif feature_type == "number":
            # Swap the roles: group by the categorical target, compare the feature values.
            Xuly_T_test(Data, target, feature, STT, 1)
        else:
            if Data_binary is None:
                Data_binary = _binarize_target(Data, target)
            # The target is now numeric (0/1) and the feature is categorical.
            Xuly_T_test(Data_binary, feature, target, STT, 0)
# Start from empty result collectors; the analysis functions append to these module-level lists.
Key_factor, P_value, Not_key = [], [], []

print("What are key factors that are playing into current attrition Rate ?")
main_(Data, "Attrition")

# Summary table: one row per detected key factor with its reliability score.
table1 = pd.DataFrame({"Is Key Factor": Key_factor, "Reliability(%)": P_value})
print(table1)
OverTime has a reliability of about 100% => overtime is a key factor: employees who do not accept overtime are more likely to leave.
Convert JobSatisfaction to category
# Data_c is a second name for the same frame (no copy is made), so the
# relabelling below is visible through Data as well.
Data_c = Data
satisfaction_names = {1: "TooLow", 2: "Low", 3: "High", 4: "TooHigh"}
Data_c["JobSatisfaction"] = Data_c["JobSatisfaction"].replace(satisfaction_names)
print(Data_c["JobSatisfaction"])
# Reset the result collectors before the second analysis.
Key_factor, P_value, Not_key = [], [], []

print("What are key factors that are playing into current JobSatisfaction Rate ?")
main_(Data, "JobSatisfaction")

# Summary table: one row per detected key factor with its reliability score.
table2 = pd.DataFrame({"Is Key Factor": Key_factor, "Reliability(%)": P_value})
print(table2)
1. Key factors for Attrition.
2. Key factors for JobSatisfaction.